Association Rules

Use the online browsing behavior dataset "browsing.txt". Each line represents a browsing session of a customer. On each line, each string of 8 characters represents the id of an item browsed during that session. The items are separated by spaces.

List the top 15 rules with corresponding confidence scores in decreasing order of confidence score for itemsets of size 2. Then list the top 15 rules with corresponding confidence scores in decreasing order of confidence score for itemsets of size 3. A rule is of the form: (item1, item2) ⇒ item3.


In [1]:
from __future__ import division
import itertools
import operator
from sys import argv

In [2]:
support = 99
mappings = []
itemCounts = []  
transactions = 0

Count all items


In [3]:
data = open("browsing.txt","r")
for basket in data:
    #print basket
    transactions += 1
    for item in set(basket.split()):
        #print item
        if item not in mappings:
            mappings.append(item)
            itemCounts.append(1)
        else:
            indexItem = mappings.index(item)
            counter = itemCounts[indexItem]
            counter += 1
            itemCounts[indexItem] = counter

data.close()

Get frequent items


In [4]:
frequentItems = [mappings.index(item) for item in mappings \
                 if itemCounts[mappings.index(item)] > support]

Get all candidate pairs (all combination pairs of frequent items).


In [5]:
candidatePairs = {}
for pair in itertools.combinations(sorted(frequentItems),2):
    candidatePairs[pair] = 0

Get counts for all candidate pairs.


In [6]:
data = open("browsing.txt","r")

for basket in data:
    fitems  = sorted( [ mappings.index(item) for item in set(basket.split()) ] )
    # Generate pairs for them and update counts
    for pair in itertools.combinations(fitems,2):
        if pair in candidatePairs:
            count = candidatePairs[pair]
            count += 1
            candidatePairs[pair] = count

data.close()

Get all frequent pairs


In [7]:
frequentPairs = sorted([k for k,v in candidatePairs.iteritems() if v > support])

Generate candidate triples by frequentPairs JOIN frequentPairs


In [8]:
candidateTriples = {}
allCandidateTriples = []
for fcPair in frequentPairs:
    for jp in [joinPair for joinPair in frequentPairs \
              if joinPair[0] == fcPair[1]]:
        allCandidateTriples.append( (fcPair[0],fcPair[1],jp[1]) )

Prune non frequent candidate triples


In [9]:
for candidate in allCandidateTriples:
    whatAboutIt = True
    for pair in itertools.combinations(candidate,2):
        if pair not in frequentPairs:
            whatAboutIt = False
            break
    if whatAboutIt:
        candidateTriples[candidate] = 0

Get count for candidate triples


In [10]:
data = open("browsing.txt","r")

for basket in data:
    items = sorted([mappings.index(item) for item in set(basket.split())])
    
    fPair = []
    for triple in itertools.combinations(items,3):
        if triple in candidateTriples:
            tripleCount = candidateTriples[triple] 
            tripleCount = tripleCount +1
            candidateTriples[triple] = tripleCount

data.close()

Get frequent triples


In [11]:
frequentTriples = sorted ([k for k,v in candidateTriples.iteritems() if v > support])

Generating Rules for confidence


In [12]:
def confidence(I,J):
    # Calculate P(IJ)
    PIJ = 0
    
    IJ = set(I).union(set(J))
    
    if len(IJ) == 2:
        PIJ = candidatePairs[tuple(sorted(IJ))]
    elif len(IJ) == 3:
        PIJ = candidateTriples[tuple(sorted(IJ))]
    
    #Calculate P(I)
    PI = 0
    if len(I) == 1:
        PI = itemCounts[I[0]]
    elif len(I) == 2:
        PI = candidatePairs[tuple(sorted(I))]
    if PIJ > PI:
        print I, J, IJ
        print PIJ, PI, PIJ / PI
    
    return PIJ / PI

Frequent pairs by confidence


In [13]:
pairRules = {}
for pair in frequentPairs:
    pairRules[pair]=confidence( (pair[0],),(pair[1],) )
    pairRules[(pair[1],pair[0])] = confidence( (pair[1],),(pair[0],) )

Frequent triples by confidence


In [14]:
tripleRules = {}
for triple in frequentTriples:
    for pair in itertools.combinations(triple,2):
        item2 = tuple(set(triple).difference(set(pair)))
        tripleRules[(pair,item2)] = confidence(pair,item2)

Final sort rules and get top 15 desc


In [15]:
cp = sorted(pairRules.iteritems(), key = operator.itemgetter(1))
cp.reverse()
cp5 = [ "%s-->%s  %s" % (mappings[rule[0][0]],mappings[rule[0][1]],rule[1])\
                                   for rule in cp[0:15] ]
print 'Top 15 pairs by confidence:'
print "\n".join(cp5)


Top 15 pairs by confidence:
DAI93865-->FRO40251  1.0
GRO85051-->FRO40251  0.999176276771
GRO38636-->FRO40251  0.990654205607
ELE12951-->FRO40251  0.990566037736
DAI88079-->FRO40251  0.986725663717
FRO92469-->FRO40251  0.983510011779
DAI43868-->SNA82528  0.972972972973
DAI23334-->DAI62779  0.954545454545
ELE92920-->DAI62779  0.732664995823
DAI53152-->FRO40251  0.717948717949
SNA18336-->DAI62779  0.713681241185
ELE55848-->GRO32086  0.709459459459
GRO89004-->ELE25077  0.698051948052
GRO81647-->GRO73461  0.677551020408
DAI37288-->ELE32164  0.646408839779

In [16]:
ct = sorted(tripleRules.iteritems(), key = operator.itemgetter(1))
ct.reverse()
ct5 = [ "{%s,%s}-->%s  %s" % (mappings[rule[0][0][0]],   \
                              mappings[rule[0][0][1]], \
                              mappings[rule[0][1][0]], \
                              rule[1])\
                            for rule in ct[0:15] ]
print 'Top 15 triples by confidence:'
print "\n".join(ct5)


Top 15 triples by confidence:
{ELE17451,GRO85051}-->FRO40251  1.0
{FRO92469,ELE20847}-->FRO40251  1.0
{ELE92920,DAI23334}-->DAI62779  1.0
{GRO85051,SNA80324}-->FRO40251  1.0
{GRO85051,GRO21487}-->FRO40251  1.0
{GRO85051,FRO53271}-->FRO40251  1.0
{DAI55911,GRO85051}-->FRO40251  1.0
{GRO85051,DAI31081}-->FRO40251  1.0
{SNA45677,GRO85051}-->FRO40251  1.0
{GRO73461,GRO85051}-->FRO40251  1.0
{GRO85051,DAI75645}-->FRO40251  1.0
{ELE26917,GRO85051}-->FRO40251  1.0
{GRO85051,ELE20847}-->FRO40251  1.0
{GRO85051,GRO38814}-->FRO40251  1.0
{DAI62779,DAI88079}-->FRO40251  1.0

Generating Rules for lift


In [17]:
def lift(J,conf):
    if isinstance(J, tuple):
        suppJ = itemCounts[J[0]]
    else:
        suppJ = itemCounts[J]
        
    SJ = suppJ / transactions
    return conf / SJ

In [18]:
liftedPairRules = { k:lift(k[1],v) for k,v in pairRules.iteritems()}
lp = sorted(liftedPairRules.iteritems(), key = operator.itemgetter(1))
lp.reverse()
lp5 = [ "%s-->%s  %s" % (mappings[rule[0][0]],mappings[rule[0][1]],rule[1])\
                                   for rule in lp[0:15] ]  
print 'Top 15 pairs by lift:'
print "\n".join(lp5)


Top 15 pairs by lift:
SNA44451-->DAI18527  67.6397014925
DAI18527-->SNA44451  67.6397014925
SNA82528-->DAI43868  50.9434889435
DAI43868-->SNA82528  50.9434889435
FRO17734-->ELE28189  50.1027877645
ELE28189-->FRO17734  50.1027877645
GRO30912-->ELE88583  47.4100609756
ELE88583-->GRO30912  47.4100609756
GRO89004-->ELE25077  45.4186477748
ELE25077-->GRO89004  45.4186477748
DAI94679-->DAI83031  44.607616955
DAI83031-->DAI94679  44.607616955
FRO81176-->DAI46755  38.4877963126
DAI46755-->FRO81176  38.4877963126
ELE88583-->SNA24799  35.7456808943

In [19]:
liftedTripleRules = { k:lift(k[1],v) for k,v in tripleRules.iteritems()}
lt = sorted(liftedTripleRules.iteritems(), key = operator.itemgetter(1))
lt.reverse()

lt5 = [ "{%s,%s}-->%s  %s" % (mappings[rule[0][0][0]],   \
                              mappings[rule[0][0][1]], \
                              mappings[rule[0][1][0]], \
                              rule[1])\
                            for rule in lt[0:15] ]
print 'Top 15 triples by lift:'
print "\n".join(lt5)


Top 15 triples by lift:
{SNA93860,FRO19221}-->SNA53220  40.4831468372
{DAI62779,DAI92600}-->DAI42083  35.3443405416
{ELE17451,DAI92600}-->DAI42083  34.1711460446
{ELE92920,DAI85309}-->SNA18336  30.3351998821
{DAI62779,DAI42083}-->DAI92600  29.8515014397
{ELE17451,ELE92920}-->SNA18336  26.0454425247
{GRO94758,SNA45677}-->ELE78169  22.4115535003
{ELE17451,SNA18336}-->ELE92920  22.1872659176
{SNA18336,DAI62779}-->ELE92920  22.1826503016
{DAI62779,ELE92920}-->SNA18336  21.6078855825
{SNA93860,FRO85978}-->ELE59028  21.4369685704
{DAI62779,SNA53220}-->FRO19221  21.2527177315
{ELE17451,DAI42083}-->DAI92600  21.0366642578
{SNA18336,DAI85309}-->ELE92920  20.8760774769
{DAI62779,ELE21353}-->FRO19221  20.117076326

Generating Rules for conviction


In [20]:
def conv(J,conf):
    if isinstance(J, tuple):
        suppJ = itemCounts[J[0]]
    else:
        suppJ = itemCounts[J]
        
    SJ = suppJ / transactions
    
    conv = float('inf')
    if not conf == 1: 
        conv = (1 - SJ)/(1 - conf)
    
    return conv

In [21]:
convictedPairRules = { k:conv(k[1],v) for k,v in pairRules.iteritems()}
convp = sorted(convictedPairRules.iteritems(), key = operator.itemgetter(1))
convp.reverse()
convp5 = [ "%s-->%s  %s" % (mappings[rule[0][0]],mappings[rule[0][1]],rule[1])\
                                   for rule in convp[0:15] ]  
print 'Top 15 pairs by conviction:'
print "\n".join(convp5)


Top 15 pairs by conviction:
DAI93865-->FRO40251  inf
GRO85051-->FRO40251  1062.50860101
GRO38636-->FRO40251  93.6477926755
ELE12951-->FRO40251  92.7725796598
DAI88079-->FRO40251  65.9327138463
FRO92469-->FRO40251  53.0754178782
DAI43868-->SNA82528  36.2933346195
DAI23334-->DAI62779  17.2839458538
ELE55848-->GRO32086  3.34712934528
GRO89004-->ELE25077  3.26092754339
DAI53152-->FRO40251  3.10302796461
ELE92920-->DAI62779  2.93876181634
SNA18336-->DAI62779  2.74391348194
GRO81647-->GRO73461  2.74208896372
SNA18336-->ELE92920  2.68390691542

In [22]:
convictedTripleRules = { k:conv(k[1],v) for k,v in tripleRules.iteritems()}
convt = sorted(convictedTripleRules.iteritems(), key = operator.itemgetter(1))
convt.reverse()

convt5 = [ "{%s,%s}-->%s  %s" % (mappings[rule[0][0][0]],   \
                              mappings[rule[0][0][1]], \
                              mappings[rule[0][1][0]], \
                              rule[1])\
                            for rule in convt[0:15] ]
print 'Top 15 triples by conviction:'
print "\n".join(convt5)


Top 15 triples by conviction:
{GRO85051,SNA80324}-->FRO40251  inf
{ELE17451,GRO85051}-->FRO40251  inf
{FRO92469,ELE20847}-->FRO40251  inf
{ELE92920,DAI23334}-->DAI62779  inf
{GRO85051,FRO53271}-->FRO40251  inf
{DAI55911,GRO85051}-->FRO40251  inf
{GRO85051,DAI31081}-->FRO40251  inf
{SNA45677,GRO85051}-->FRO40251  inf
{GRO73461,GRO85051}-->FRO40251  inf
{GRO85051,DAI75645}-->FRO40251  inf
{ELE26917,GRO85051}-->FRO40251  inf
{GRO85051,ELE20847}-->FRO40251  inf
{GRO85051,GRO38814}-->FRO40251  inf
{DAI62779,DAI88079}-->FRO40251  inf
{GRO85051,GRO21487}-->FRO40251  inf

In [ ]: